Try to analyse tabula muris data Read it in first
require("Matrix")
library(DropletUtils)
library(scater)
library(scran)
library(EnsDb.Mmusculus.v79)
#setwd("/Volumes/SlowDrive/NGSdata/tenX_genomics/tabulaMuris/")
setwd("/Users/brown.d/Data/scRNA_seq/")
The working directory was changed to /Users/brown.d/Data/scRNA_seq inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
sce = readRDS("marrow_droplet.rds")
sce
class: SingleCellExperiment
dim: 23433 4112
metadata(0):
assays(1): counts
rownames(23433): Xkr4 Rp1 ... Tdtom_transgene zsGreen_transgene
rowData names(0):
colnames(4112): 10X_P7_2_AAACCTGCAGTAACGG-1 10X_P7_2_AAACGGGAGAAGAAGC-1 ... 10X_P7_3_TTTGTCATCCGCATAA-1
10X_P7_3_TTTGTCATCTGGGCCA-1
colData names(3): mouse type batch
reducedDimNames(0):
spikeNames(0):
I don’t need to convert the gene names here as it is already done. Get the mitochondrial gene location. Be careful of geneID vs gene symbol. Here I immediately get symbol
location <- mapIds(EnsDb.Mmusculus.v79, keys=row.names(sce),
column="SEQNAME", keytype="SYMBOL")
rowData(sce)$CHR <- location
summary(location=="MT")
Mode FALSE NA's
logical 21295 2138
I guess the mitochondrial genes were removed. Separate the cell drops from the empty drops
# Set the empty drop threshold to 100 then perform a statistical test (FDR 0.01) to call cell drops from the empties
set.seed(100) # Monte Carlo test is used therefore set seed
e.out <- emptyDrops(counts(sce), lower = 100)
sum(e.out$FDR <= 0.01, na.rm=TRUE)
[1] 3039
sum(e.out$FDR >= 0.01, na.rm=TRUE)
[1] 1073
# The number of empty drops is printed
table(Sig=e.out$FDR <= 0.01, Limited=e.out$Limited)
Limited
Sig FALSE
FALSE 1073
TRUE 3039
# Limited: = Logical, indicating whether a lower p-value could be obtained by increasing npts.
# Subset the single cell experiment so it only contains cells
sce <- sce[,which(e.out$FDR <= 0.01)]
There are just over 3 thousand cells in the experiment Now we do some quality control on the cells
#### Quality control on the cells ####
# This is a very relaxed filter, Aaron recommends going back and forth from clustering to make QC tighter
sce <- calculateQCMetrics(sce, feature_controls=list(Mito=which(location=="MT")))
par(mfrow=c(2,1))
hist(sce$total_counts, breaks=20, col="grey80",
xlab="Log-total UMI count")
hist(sce$total_features_by_counts, breaks=20, col="grey80",
xlab="Log-total number of expressed features")
The number of genes per cell at just overr 2000 seems normal for a scRNA-seq experiment Lets look at highly expressed genes
ave <- calcAverage(sce)
rowData(sce)$AveCount <- ave
par(mfrow=c(1,1))
hist(log10(ave), col="grey80")
#plotHighestExprs(sce)
The average expression of many genes appears to be less than 1. Is this typical of scRNA-seq or is this a problem with the data? Lets continue with normalisation
# Perform preclustering before normalisation to avoid grouping cells that are too different together in the normalisation
clusters <- quickCluster(sce, method="igraph", min.mean=0.1,
irlba.args=list(maxit=1000)) # for convergence.
table(clusters)
clusters
1 2 3 4 5
410 322 424 1412 471
sce <- computeSumFactors(sce, min.mean=0.1, cluster=clusters)
summary(sizeFactors(sce))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.01289 0.36695 0.72254 1.00000 1.32812 17.96713
# Look at the correlation between library size and the scaling factor used in normalisation
plot(sce$total_counts, sizeFactors(sce), log="xy")
sce <- normalize(sce)
#### Modelling the mean-variance trend ####
# Fit a trend on technical noise using a poisson distribution
new.trend <- makeTechTrend(x=sce)
fit <- trendVar(sce, use.spikes=FALSE, loess.args=list(span=0.05))
plot(fit$mean, fit$var, pch=16)
curve(fit$trend(x), col="dodgerblue", add=TRUE)
curve(new.trend(x), col="red", add=TRUE)
# We decompose the variance for each gene using the Poisson-based trend
# examine the genes with the highest biological components.
fit0 <- fit
fit$trend <- new.trend
dec <- decomposeVar(fit=fit)
top.dec <- dec[order(dec$bio, decreasing=TRUE),]
head(top.dec)
DataFrame with 6 rows and 6 columns
mean total bio tech p.value FDR
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
Beta-s 7.08460989170765 22.1036927332672 21.8489692689798 0.254723464287441 0 0
Hbb-b2 2.23288677247864 14.9422224525611 13.5640616182319 1.37816083432923 0 0
Alas2 1.93251301554979 13.4523683686937 12.1648500192596 1.28751834943416 0 0
S100a8 6.73725190662508 12.2619393107751 11.8735184464179 0.388420864357245 0 0
Ngp 4.57717822025739 12.5615383938966 11.0692356585205 1.49230273537606 0 0
S100a9 6.51555087629298 11.3059282114114 10.8125957345078 0.493332476903561 0 0
# Plot genes with highest biological variation
plotExpression(sce, features=rownames(top.dec)[1:10])
The are 2 patterns of variable genes. All or nothing like Hbb-b2 or multimodal like s100a8 Lets do some dimension reduction For some reason I am always getting an error that the chunk files can’t be found. By setting cache to false the plots are produced.
sce <- denoisePCA(sce, technical=new.trend, approx=TRUE)
ncol(reducedDim(sce, "PCA"))
[1] 10
plot(attr(reducedDim(sce), "percentVar"), xlab="PC",
ylab="Proportion of variance explained")
abline(v=ncol(reducedDim(sce, "PCA")), lty=2, col="red")
Plot the PCA representation
plotPCA(sce, ncomponents=3, colour_by="log10_total_features_by_counts")
There is some structure in the data especially in dimension 1
# DO a t-SNE on the PCA reduced spaceq()
sce <- runTSNE(sce, use_dimred="PCA", perplexity=30, rand_seed=100)
'rand.seed=' is deprecated.
Use 'set.seed' externally instead.
plotTSNE(sce, colour_by="log10_total_features_by_counts")
'add_ticks' is deprecated.
Use '+ geom_rug(...)' instead.
t-SNE looks OK lets try UMAP
set.seed(100)
sce = runUMAP(sce, ncomponents = 2, scale_features = TRUE, use_dimred = "PCA", n_dimred = 10)
plotUMAP(sce, colour_by="log10_total_features_by_counts")
'add_ticks' is deprecated.
Use '+ geom_rug(...)' instead.
UMAP does indeed look nicer. Lets do some clustering
snn.gr <- buildSNNGraph(sce, use.dimred="PCA")
clusters <- igraph::cluster_walktrap(snn.gr)
sce$Cluster <- factor(clusters$membership)
table(sce$Cluster)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
205 141 144 205 78 139 192 120 382 37 150 291 119 134 174 112 41 42 36 70 29 31 27 16 96 28
cluster.mod <- clusterModularity(snn.gr, sce$Cluster, get.values=TRUE)
log.ratio <- log2(cluster.mod$observed/cluster.mod$expected + 1)
library(pheatmap)
pheatmap(log.ratio, cluster_rows=FALSE, cluster_cols=FALSE,
color=colorRampPalette(c("white", "blue"))(100))
plotUMAP(sce, colour_by="Cluster")
'add_ticks' is deprecated.
Use '+ geom_rug(...)' instead.
markers <- findMarkers(sce, clusters=sce$Cluster, direction="up")
marker.set <- markers[["2"]]
head(marker.set[,1:8], 10) # only first 8 columns, for brevity
DataFrame with 10 rows and 8 columns
Top p.value FDR logFC.1 logFC.3
<integer> <numeric> <numeric> <numeric> <numeric>
Xist 1 7.67650412105267e-116 1.79883521068626e-111 0.530455101614717 0.478041972200618
Eef1a1 1 1.80715957090427e-92 2.11735851124998e-88 0.437115228684105 0.568391129749194
Ptma 1 8.52564145756482e-89 4.99453390687787e-85 0.650988125404034 -1.61691429411616
Hmgb1 1 6.42766287196884e-87 2.77262931960248e-83 1.02927870262637 -2.05807636763905
Srgn 1 1.18642217207871e-79 1.63537827990119e-76 1.4637510940961 0.879088221372878
Gmfg 1 3.51948623787601e-78 4.58178450067491e-75 0.104353999621912 0.241571698271296
Hsp90ab1 1 2.14031613290255e-73 1.72944923938986e-70 0.828261998593806 0.568688767387315
Ptpn18 1 4.21073686362566e-70 2.29465574244978e-67 -0.281808390893418 0.820365190251775
Cct2 1 6.7294453035116e-65 2.50303320312994e-62 0.399107884507495 -0.0400780023969025
Ifitm2 1 1.13897811621615e-63 3.92495208783717e-61 0.663467388649869 2.20883739851321
logFC.4 logFC.5 logFC.6
<numeric> <numeric> <numeric>
Xist 0.891198178355348 0.834964797395584 2.84221687708966
Eef1a1 1.41436692385796 1.48139985490674 2.76415051376123
Ptma -0.111855430668029 0.507426961201439 4.37466828876474
Hmgb1 -0.199704165280691 -0.835889886348565 2.65717637262872
Srgn 1.83975798680507 2.90709877018893 1.90890939554896
Gmfg -0.707216141005549 2.41305736881803 -1.97349261031347
Hsp90ab1 2.00965330047302 2.5972618116879 2.4393696782989
Ptpn18 -0.00543916402455435 2.10591613047892 -0.306372960064296
Cct2 0.743148455719703 0.936866724291169 1.34010299366871
Ifitm2 0.311389489123578 2.28704950907702 -0.618286005254992
chosen <- rownames(marker.set)[marker.set$Top <= 10]
plotHeatmap(sce, features=chosen, exprs_values="logcounts",
zlim=5, center=TRUE, symmetric=TRUE, cluster_cols=FALSE,
colour_columns_by="Cluster", columns=order(sce$Cluster))